In [1]:
# pip install wordcloud
In [2]:
# pip install missingno
In [ ]:
# pip install cufflinks
In [4]:
import numpy as np
import pandas as pd

# Basic Visualization tools
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams['figure.dpi'] = 300
import seaborn as sns
sns.set_palette('husl')

plt.rc('figure',figsize=(17,13))
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots

# Special Visualization
import wordcloud, missingno
from wordcloud import WordCloud # wordcloud
import missingno as msno # check missing value
#import networkx as nx

# Plotly visualization 
import plotly.offline as py
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
# Required to use plotly offline in jupyter notebook
py.init_notebook_mode(connected = True) 

#importing plotly and cufflinks in offline mode  
import cufflinks as cf 
import plotly.offline  
cf.go_offline()  
cf.set_config_file(offline=False, world_readable=True)

# Display markdown formatted output like bold, italic bold etc
from IPython.display import Markdown
def bold(string):
    display(Markdown(string))
    
import plotly.io as pio
pio.renderers.default='notebook'
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
In [5]:
data = pd.read_csv('bestsellers.csv')
data.head()
Out[5]:
Name Author User Rating Reviews Price Year Genre
0 10-Day Green Smoothie Cleanse JJ Smith 4.7 17350 8 2016 Non Fiction
1 11/22/63: A Novel Stephen King 4.6 2052 22 2011 Fiction
2 12 Rules for Life: An Antidote to Chaos Jordan B. Peterson 4.7 18979 15 2018 Non Fiction
3 1984 (Signet Classics) George Orwell 4.7 21424 6 2017 Fiction
4 5,000 Awesome Facts (About Everything!) (Natio... National Geographic Kids 4.8 7665 12 2019 Non Fiction
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550 entries, 0 to 549
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         550 non-null    object 
 1   Author       550 non-null    object 
 2   User Rating  550 non-null    float64
 3   Reviews      550 non-null    int64  
 4   Price        550 non-null    int64  
 5   Year         550 non-null    int64  
 6   Genre        550 non-null    object 
dtypes: float64(1), int64(3), object(3)
memory usage: 30.2+ KB
In [7]:
# show numerical columns
data.describe()
Out[7]:
User Rating Reviews Price Year
count 550.000000 550.000000 550.000000 550.000000
mean 4.618364 11953.281818 13.100000 2014.000000
std 0.226980 11731.132017 10.842262 3.165156
min 3.300000 37.000000 0.000000 2009.000000
25% 4.500000 4058.000000 7.000000 2011.000000
50% 4.700000 8580.000000 11.000000 2014.000000
75% 4.800000 17253.250000 16.000000 2017.000000
max 4.900000 87841.000000 105.000000 2019.000000
In [8]:
# show string type columns
data.describe(include=['O'])
Out[8]:
Name Author Genre
count 550 550 550
unique 351 248 2
top Publication Manual of the American Psychologic... Jeff Kinney Non Fiction
freq 10 12 310
In [9]:
data.describe(include='all')
Out[9]:
Name Author User Rating Reviews Price Year Genre
count 550 550 550.000000 550.000000 550.000000 550.000000 550
unique 351 248 NaN NaN NaN NaN 2
top Publication Manual of the American Psychologic... Jeff Kinney NaN NaN NaN NaN Non Fiction
freq 10 12 NaN NaN NaN NaN 310
mean NaN NaN 4.618364 11953.281818 13.100000 2014.000000 NaN
std NaN NaN 0.226980 11731.132017 10.842262 3.165156 NaN
min NaN NaN 3.300000 37.000000 0.000000 2009.000000 NaN
25% NaN NaN 4.500000 4058.000000 7.000000 2011.000000 NaN
50% NaN NaN 4.700000 8580.000000 11.000000 2014.000000 NaN
75% NaN NaN 4.800000 17253.250000 16.000000 2017.000000 NaN
max NaN NaN 4.900000 87841.000000 105.000000 2019.000000 NaN

Check missing data with missingno¶

In [10]:
msno.matrix(data)
Out[10]:
<Axes: >
No description has been provided for this image

Bestseller rating count¶

In [11]:
temp_df = data['User Rating'].value_counts().reset_index()

# create trace1
trace1 = go.Bar(x = temp_df.index, y = temp_df['User Rating'],
                marker = dict(color = 'rgb(255,165,0)',
                              line=dict(color='rgb(0,0,0)',width=1.5)))
layout = go.Layout(template= "plotly_dark",title = 'RATINGS COUNT' , 
                   xaxis = dict(title = 'Rating'), yaxis = dict(title = 'Count'))
fig = go.Figure(data = [trace1], layout = layout)
fig.show()
bold("**MOST OF THE RATINGS ARE IN THE RANGE OF 4.6 TO 4.8**")

MOST OF THE RATINGS ARE IN THE RANGE OF 4.6 TO 4.8

In [12]:
def pie_plot(cnt_srs, title):
    labels=cnt_srs.index
    values=cnt_srs.values
    trace = go.Pie(labels=labels, 
                   values=values, 
                   title=title, 
                   hoverinfo=['percent+value'], 
                   textinfo='percent',
                   textposition='inside',
                   hole=0.3,
                   showlegend=True,
                   marker=dict(colors=plt.cm.viridis_r(np.linspace(0, 1, 14)),
                               line=dict(color='#000000', width=2)  ) 
                  )
    return trace
py.iplot([pie_plot(data['User Rating'].value_counts(), 'Rating')])

Relationship between Price and User Rating¶

In [13]:
fig, ax = plt.subplots(1,1, figsize=(12, 7), dpi=72)
sns.regplot(data=data, x='Price', y='User Rating', ax=ax)
plt.show()
No description has been provided for this image

Relationship between Price and Reviews¶

In [14]:
fig, ax = plt.subplots(1,1, figsize=(12, 7), dpi=72)
sns.regplot(data=data, x='Price', y='Reviews', color='b', ax=ax)
plt.show()
No description has been provided for this image
In [ ]:
 
In [15]:
temp_df = data['Genre'].value_counts().reset_index()

trace1 = go.Bar(
                x = temp_df.index,
                y = temp_df['Genre'],
                marker = dict(color = 'rgb(255,165,0)',
                              line=dict(color='rgb(0,0,0)',width=1.5)))

layout = go.Layout(template= "plotly_dark", xaxis = dict(title = 'Genre'), 
                   yaxis = dict(title = 'Count'))
fig = go.Figure(data = [trace1], layout = layout)
fig.show()
In [16]:
def pie_plot(cnt_srs, colors, title):
    '''A Function To Plot Pie Plot using Plotly'''
    labels=cnt_srs.index
    values=cnt_srs.values
    trace = go.Pie(labels=labels, 
                   values=values, 
                   title=title, 
                   hoverinfo='percent+value', 
                   textinfo='percent',
                   textposition='inside',
                   hole=0.3,
                   showlegend=True,
                   marker=dict(colors=colors,
                               line=dict(color='#000000',
                                         width=2),  )
                  )
    return trace

bold("**NON-FICTION BESTSELLERS ARE MORE THAN FICTION**")
py.iplot([pie_plot(data['Genre'].value_counts(), ['magenta', 'cyan'], 'GENRE')])

NON-FICTION BESTSELLERS ARE MORE THAN FICTION

Bestseller count based on genre over the years¶

In [17]:
d1 = data[data["Genre"] == "Fiction"]
d2 = data[data["Genre"] == "Non Fiction"]

# Fiction
vc1 = pd.DataFrame(data={'count' : d1.Year.value_counts() })
vc1['percent'] = vc1['count'].apply(lambda x : 100*x/sum(vc1['count']))
vc1 = vc1.sort_index()

# Non-Fiction
vc2 = pd.DataFrame(data={'count' : d2.Year.value_counts() })
vc2['percent'] = vc2['count'].apply(lambda x : 100*x/sum(vc2['count']))
vc2 = vc2.sort_index()

trace1 = go.Scatter(x=vc1.index, y=vc1["count"], name="Fiction", 
                        marker=dict(color = 'rgb(249, 6, 6)',
                        line=dict(color='rgb(0,0,0)',width=1.5)))

trace2 = go.Scatter(x=vc2.index, y=vc2["count"], name="Non Fiction", 
                        marker= dict(color = 'rgb(26, 118, 255)',
                        line=dict(color='rgb(0,0,0)',width=1.5)))
layout = go.Layout(hovermode= 'closest', 
                   title = 'GENRE BASED BESTSELLERS COUNT OVER THE YEARS' , 
                   xaxis = dict(title = 'Year'), yaxis = dict(title = 'Count'),
                   template= "plotly_dark")
fig = go.Figure(data = [trace1, trace2], layout=layout)
fig.show()

Rating on genre¶

In [18]:
df1 = data[data["Genre"] == "Fiction"]
df2 = data[data["Genre"] == "Non Fiction"]

temp_df1 = pd.DataFrame(data={'Rating':df1['User Rating'].value_counts()}).sort_index()
temp_df2 = pd.DataFrame(data={'Rating':df2['User Rating'].value_counts()}).sort_index()

trace1 = go.Bar(x = temp_df1.index, y = temp_df1['Rating'],
                name="Fiction", marker = dict(color = 'rgb(249, 6, 6)',
                             line=dict(color='rgb(0,0,0)',width=1.5)))
# create trace2 
trace2 = go.Bar(x = temp_df2.index, y = temp_df2['Rating'],
            name = "Non Fiction", marker = dict(color = 'rgb(26, 118, 255)',
                         line=dict(color='rgb(0,0,0)',width=1.5)))

layout = go.Layout(template= "plotly_dark",title = 'RATING BY GENRE' , 
                   xaxis = dict(title = 'Rating'), yaxis = dict(title = 'Count'))
fig = go.Figure(data = [trace1, trace2], layout = layout)
fig.show()

Top 9 multiple times bestsellers¶

In [19]:
common_books = data['Name'].value_counts()[:9].rename_axis('Common Books')\
.reset_index(name='count')

fig = px.treemap(common_books, path=['Common Books'], 
                 values='count',title='TOP 9 MULTIPLE TIMES BESTSELLERS')

fig.show()

TOP 10 AUTHORS WITH MOST BESTSELLERS¶

In [20]:
temp_df1 = data.groupby('Author').count().reset_index().sort_values('Name',ascending=False).head(10)

trace1 = go.Bar(
                x = temp_df1['Author'],
                y = temp_df1['Name'],
                marker = dict(color = 'rgb(255,165,0)',
                             line=dict(color='rgb(0,0,0)',width=1.5)))
layout = go.Layout(template= "plotly_dark",title = 'TOP 10 AUTHORS WITH MOST BESTSELLERS ' , 
                   xaxis = dict(title = 'Author',tickangle=45), yaxis = dict(title = 'COUNT'))
fig = go.Figure(data = [trace1], layout = layout)
fig.show()

AUTHORS WITH BESTSELLERS IN BOTH THE GENRES:FICTION AND NON-FICTION¶

In [21]:
data[data['Author'].isin(list(data.groupby(['Author','Genre']).count().reset_index()\
                              ['Author'].value_counts()[:2].index))]
Out[21]:
Name Author User Rating Reviews Price Year Genre
28 Baby Touch and Feel: Animals DK 4.6 5360 5 2015 Non Fiction
158 Harry Potter Coloring Book Scholastic 4.7 3564 9 2015 Non Fiction
268 Pokémon Deluxe Essential Handbook: The Need-to... Scholastic 4.7 3503 9 2016 Fiction
514 Ultimate Sticker Book: Frozen: More Than 60 Re... DK 4.5 2586 5 2014 Fiction

Top 3 authors with highest bestsellers in 2019¶

In [22]:
temp_df1=data[data['Year']==2019].groupby('Author').count().reset_index()\
.sort_values('Name',ascending=False).head(3)

trace1 = go.Bar(
                x = temp_df1['Author'],
                y = temp_df1['Name'],
                marker = dict(color = 'rgb(255,165,0)',
                             line=dict(color='rgb(0,0,0)',width=1.5)))
layout = go.Layout(template= "plotly_dark",title = 'TOP 3 AUTHORS WITH HIGHEST BESTSELLERS IN 2019 ' , 
                   xaxis = dict(title = 'Author'), yaxis = dict(title = 'COUNT'))
fig = go.Figure(data = [trace1], layout = layout)
fig.show()
In [23]:
temp_df1=data[data['Year']==2019].groupby('User Rating').count().reset_index()\
.sort_values('Name',ascending=False)

trace1 = go.Bar(
                x = temp_df1['User Rating'],
                y = temp_df1['Name'],
                marker = dict(color = 'rgb(255,165,0)',
                             line=dict(color='rgb(0,0,0)',width=1.5)))
layout = go.Layout(template= "plotly_dark",title = 'Ratings In 2019 ' , 
                   xaxis = dict(title = 'Ratings'), yaxis = dict(title = 'COUNT'))
fig = go.Figure(data = [trace1], layout = layout)
fig.show()

create new dataframes based on the average rating of authors, the number of books written by authors, the number of reviews for books and the number of books by genre. We visualize the received information.¶

In [24]:
# We take all authors with the highest average rating
top_13_autors = data.groupby('Author')[['User Rating']].mean()\
                                                              .sort_values('User Rating', ascending=False)\
                                                              .head(13)\
                                                              .reset_index()
# We take the authors who have written the most bestsellers
number_of_books_written = data.groupby('Author')[['Name']].count()\
                                                                 .sort_values('Name', ascending=False)\
                                                                 .head(10)\
                                                                 .reset_index()

# We take books with the number of reviews
books_with_the_number_of_reviews = data.groupby('Name')[['Reviews']].sum()\
                                                                           .sort_values('Reviews', ascending=False)\
                                                                           .head(5)\
                                                                           .reset_index()

# We take the number of books by genre
number_of_books_by_genre = data.groupby('Genre')[['Name']].count()\
                                                                 .sort_values('Name', ascending=False)\
                                                                 .head(10)\
                                                                 .reset_index()
In [25]:
# Let's set the visualization parameters
fig_1 = make_subplots(rows=1, cols=1, subplot_titles=("")) # Let's set the number of rows and columns


# Setting table parameters
fig_1.add_trace(go.Table(columnorder = [1,2],
                         columnwidth = [100,50],
                         header=dict(values=top_13_autors.columns, 
                                     line_color='darkslategray',
                                     fill_color='Salmon',
                                     height=30),
                         cells=dict(values=[top_13_autors['Author'],
                                    top_13_autors['User Rating']], 
                                    line_color='darkslategray',
                                    fill_color='White')))

# Setting the parameters of the chart when displaying
fig_1.update_layout(showlegend=False, 
                    title_text='Top 13 best authors', 
                    title_font_size=16, 
                    title_font_family='Arial', 
                    title_x=0.5,
                    font=dict(family='Arial', 
                              size=12, 
                              color='black'))

# Displaying the graph
fig_1.show()

# Let's set the visualization parameters
fig_2 = make_subplots(rows=1, cols=2,
                      specs=[[{'type': 'xy'}, {"type": "xy"}]],
                      subplot_titles=("Top 10 authors by number of bestsellers", 
                                      "Top 5 books by number of reviews"))

# Setting Bar parameters
fig_2.add_trace(go.Bar(x=number_of_books_written['Author'], 
                       y=number_of_books_written['Name'],
                       name ='Count books',
                       marker_color=['#ff6666','#f76e6e', '#f07575', '#e87d7d', '#e08585',
                                     '#d98c8c', '#d19494', '#c99c9c', '#c2a3a3', '#baabab']),
                       row=1, col=1)

# Setting Bar parameters
fig_2.add_trace(go.Bar(x=books_with_the_number_of_reviews['Name'], 
                       y=books_with_the_number_of_reviews['Reviews'],
                       marker_color=['#ff6666','#f47171', '#e87d7d', '#dd8888', '#d19494'],
                       name ='Reviews'),
                       row=1, col=2)

# Setting the parameters of the chart when displaying
fig_2.update_traces(marker_line_color='Gray',
                    marker_line_width=1)

# Setting the parameters of the chart when displaying
fig_2.update_layout(showlegend=False, 
                    plot_bgcolor='rgba(0,0,0,0)',
                    font=dict(family='Arial', 
                              size=12, 
                              color='black'))

# Displaying the graph
fig_2.show()

# Let's set the visualization parameters
fig_3 = make_subplots(rows=1, cols=1,
                      specs=[[{'type': 'domain'}]],
                      subplot_titles=(""))

# Setting Pie parameters
fig_3.add_trace(go.Pie(values=number_of_books_by_genre['Name'],
                       title='Genre',
                       labels=['Non Fiction', 'Fiction'],
                       textinfo='label+percent',
                       insidetextorientation='radial',
                       hole=0.3,
                       marker=dict(colors=['Gray','Salmon'], 
                                   line_color='Gray', 
                                   line_width=1),
                       textfont={'color': '#FFFFFF', 'size': 12},
                       textfont_size=12))

# Setting the parameters of the chart when displaying
fig_3.update_layout(margin_t=30, 
                    showlegend=False, 
                    title_text='Count of books genre', 
                    title_x=0.5, 
                    font=dict(family='Arial', 
                              size=12, 
                              color='black'))

# Displaying the graph
fig_3.show()
In [ ]: